Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import zscore
from sklearn.cluster import KMeans
# Load the vehicle silhouette data set from disk.
vehicle_df_orig = pd.read_csv("vehicle.csv")

# Keep only complete observations: any row with at least one NA is dropped.
vehicle_df = vehicle_df_orig.dropna(axis=0, how='any')

# Report how many rows were lost to missing values.
n_raw = len(vehicle_df_orig)
n_clean = len(vehicle_df)
print("Old data frame length:", n_raw,
      "\nNew data frame length:", n_clean,
      "\nNumber of rows with at least 1 NA value: ", n_raw - n_clean)

vehicle_df.info()
Since the variable 'class' is categorical, you can use the value_counts function to inspect its distribution.
# Bar chart of the target distribution ('class' is categorical).
vehicle_df['class'].value_counts().plot.bar(title='Vehicle Types')
print(vehicle_df['class'].value_counts())

# Per-column missing-value counts (should all be 0 after the dropna above).
print(vehicle_df.isnull().sum())

# BUG FIX: isnull() and isna() are aliases in pandas, so summing both
# double-counted every missing value; count them once.
total_missing_values = vehicle_df.isnull().sum().sum()
print('There are {} missing values in the vehicle database'.format(total_missing_values))
Since the features are measured on different, unknown scales, it is wise to standardize the data using z-scores before applying any clustering method. You can use the zscore function from scipy.stats to do this.
from scipy.stats import zscore

# Standardize every numeric column to z-scores, then re-attach the class label.
feature_frame = vehicle_df.drop('class', axis=1)
vehicle_type = vehicle_df.pop("class")
feature_frame = feature_frame.apply(zscore)
# Rebuild vehicle_df: scaled numeric columns plus the original vehicle types.
vehicle_df = feature_frame.join(vehicle_type)
vehicle_df
# The below function is inspired from the article https://kite.com/blog/python/data-analysis-visualization-python/
# Checking for Outliers in the data
# IQR based outlier identification - Distplot
def IQR_based_outlier(data, threshold=1.5):
    """Return a boolean mask flagging values outside Tukey's fences.

    A value is an outlier when it lies outside
    [Q1 - threshold*IQR, Q3 + threshold*IQR].

    Parameters
    ----------
    data : array-like of numbers (1-D)
    threshold : float, IQR multiplier (1.5 is the classic Tukey fence)
    """
    # Compute both quartiles in a single call instead of four separate
    # np.quantile evaluations over the same data.
    q1, q3 = np.quantile(data, [0.25, 0.75])
    iqr = q3 - q1
    minval = q1 - iqr * threshold
    maxval = q3 + iqr * threshold
    return (data < minval) | (data > maxval)
# One distribution plot per numeric column, with IQR-flagged outliers
# overlaid as red dots on the x-axis.
col_names = vehicle_df.select_dtypes(include=['float64', 'int64']).columns
fig, ax = plt.subplots(len(col_names), figsize=(8, 200))
for idx, col in enumerate(col_names):
    x = vehicle_df[col][:1000]
    sns.distplot(x, ax=ax[idx], rug=True, hist=False)
    outliers = x[IQR_based_outlier(x)]
    print('List of Outliers detected for - {} \n '.format(col), outliers)
    ax[idx].plot(outliers, np.zeros_like(outliers), 'ro', clip_on=False)
    ax[idx].set_title('Outlier detection - {}'.format(col), fontsize=10)
    ax[idx].set_xlabel(col, fontsize=8)
plt.show()
# No outliers detected for the following columns(10)
# Compactness, Circularity, distance_circularity, scatter_ratio, elogatedness, axis_rectangularity
# max.length_rectangularity, sclaed_radius_of_gyration and skewness_about.2 and hollows_ratio
# Outliers detected for the following columns(8)
# radius_ratio(3), pr.axis_aspect_ratio(7), max.length_aspect_ratio(10), scaled_variance(1), scaled_variance.1(2)
# scaled_radius_of_gyration.1(10), skewness_about(3) and skewness_about.1(3)
# with univariate analysis of these columns we are looking at a minimum of 2 clusters to maximum of 6 clusters

# Bi-variate analysis: pairwise scatter/KDE grid as a visual cue for the
# number of clusters.
sns.pairplot(vehicle_df, diag_kind='kde')
# Based on the Peaks, we can infer that there are 3 clusters in this data

# Correlation heatmap to check for multi-collinearity between features.
plt.figure(figsize=(25, 15))
corr_matrix = vehicle_df.corr()
ax = sns.heatmap(corr_matrix, annot=True, linewidths=0.5, cmap="YlGnBu")
# Workaround for the matplotlib 3.1.1 regression that clips the top and
# bottom rows of seaborn heatmaps.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
# We see lot of multi-collinearity between features and ideal way to do better clustering would be to work on outliers
# and features selection before clustering
# Elongatedness is negatively correlated and highly collinear with most of the features
# Compactness, Circularity, distance circularity is positively correlated with most of the features
# pr.axis_rectangulariy, max_length_rectangularity, scaled_variance,
# Scaled_variance_1 and Scaled_radius_of_gyration +VE CORR
Cluster_error = []  # average within-cluster distortion recorded for each candidate k
Iterate values of k from 1 to 9, fit a K-means model for each, and record the average distortion (mean distance of each point to its nearest centroid).
# Finding optimal no. of clusters
from scipy.spatial.distance import cdist

clusters = range(1, 10)
# Fit K-means for k = 1..9 and record, for each k, the mean distance of
# every point to its nearest centroid (average distortion).
vehicle_df_features = vehicle_df.drop('class', axis=1)
for k in clusters:
    model = KMeans(n_clusters=k)
    model.fit(vehicle_df_features)
    prediction = model.predict(vehicle_df_features)
    dists = cdist(vehicle_df_features, model.cluster_centers_, 'euclidean')
    Cluster_error.append(np.min(dists, axis=1).sum() / vehicle_df_features.shape[0])
Use Matplotlib to plot the scree (elbow) plot. Note: a scree plot shows the error against the number of clusters.
# Elbow plot: the optimal cluster count sits where the SSE drop levels off.
plt.plot(clusters, Cluster_error, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')

# First candidate: K = 2.
final_model = KMeans(2)
prediction = final_model.fit_predict(vehicle_df_features)
vehicle_df["GROUP"] = prediction
vehicle_df.boxplot(by='GROUP', figsize=(15, 30))
# when grouped to 2 vehicle types we see outliers in most of the parameters which indicates further hidden pattern;
# hence, moving on to K = 3
# Second candidate: K = 3.
final_model = KMeans(3)
prediction = final_model.fit_predict(vehicle_df_features)
vehicle_df["GROUP"] = prediction
vehicle_df.boxplot(by='GROUP', figsize=(15, 30))
vehicle_df['GROUP'].value_counts().plot.bar(title='Vehicle Types')
print(vehicle_df['GROUP'].value_counts())
# Based on K= 3 grouping we have the following findings; unlabelled group 0- bus; group 1- car; group 2 - van
# circular, compact and distance circularity are higher for group 1 than group 0 and group 2;
# --> cars are more sophisticated than bus and Van
# Elongatedness is higher in group 0 than group 1 and 2;
# --> Buses are lengthier and elongated than car and van
# Hollows_ratio is higher for group 1 and group 2 and less for group 0; outliers found for group 1
# Max length aspect ratio is slightly higher for group 1 than group 0 and group 2
# Max length rectangularity is slightly higher for group 1 than group 0 and group 2
# Good dispersion of data; we can fairly decide the clustering is done better
# Third candidate: K = 4.
final_model = KMeans(4)
prediction = final_model.fit_predict(vehicle_df_features)
vehicle_df["GROUP"] = prediction
vehicle_df.boxplot(by='GROUP', figsize=(15, 30))
vehicle_df['GROUP'].value_counts().plot.bar(title='Vehicle Types')
print(vehicle_df['GROUP'].value_counts())
# Based on K= 4 grouping we have the following findings; unlabelled group 0, 1, 2 and 3
# Dispersion of data for Group 3 is very less to draw meaningful conclusions; hence Optimal value of K can be 3
# We will verify that with Silhouette coefficient
# The below function is from Sklearn for the Silhouette scores - Cluster Analysis ( Kmeans)
from __future__ import print_function
%matplotlib inline
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
print(__doc__)

# Silhouette analysis over a range of candidate cluster counts,
# adapted from the scikit-learn silhouette example.
X = vehicle_df_features.values
range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]

for n_clusters in range_n_clusters:
    # One figure per k: silhouette bands on the left, data scatter on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # Silhouette coefficients lie in [-1, 1]; restrict the axis to the
    # range the values actually occupy here.
    ax1.set_xlim([-0.1, 1])
    # (n_clusters + 1) * 10 extra units leave blank space between the
    # per-cluster silhouette bands so they are clearly demarcated.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Fixed random_state=10 keeps the clustering reproducible.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # Mean silhouette over all samples: a density/separation summary for this k.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Per-sample silhouette values, drawn as one sorted band per cluster.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Collect and sort the silhouette values of the samples in cluster i.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.Spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Cluster index label at the vertical middle of its band.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        y_lower = y_upper + 10  # 10 units of padding before the next band

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical reference line at the average silhouette score.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Right panel: the first two feature dimensions coloured by cluster.
    colors = cm.Spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors)

    # White circles at the centroids, numbered by cluster index.
    centers = clusterer.cluster_centers_
    ax2.scatter(centers[:, 0], centers[:, 1],
                marker='o', c="white", alpha=1, s=200)
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')
plt.show()
# Based on the Dispersion of the Data and Silhouette scores for K =2,3,4,5 and 6; Optimal value of K will be 3
# Although the score for K=4 is bit higher; dispersion is better for K=3 with not much difference in the silhouette scores
# Final clustering with the chosen k = 3 (silhouette score ~0.2904).
final_model = KMeans(3)
prediction = final_model.fit_predict(vehicle_df_features)
vehicle_df["GROUP"] = prediction
Note: Since the data has more than 2 dimensions, we cannot visualize it directly. As an alternative, we can inspect the centroids and note how they are distributed across the different dimensions.
centroids = final_model.cluster_centers_  # (k, n_features) array of centroid coordinates
You can use the kmeans.cluster_centers_ attribute to pull the centroid information from the fitted instance.
# Tabulate the centroids: one row per cluster, one column per feature.
feature_names = vehicle_df_features.columns
c_df = pd.DataFrame(centroids, columns=feature_names)
c_df.transpose()
Hint: Use the pd.DataFrame constructor.
# Attach the final cluster labels and inspect the cluster sizes (k = 3).
vehicle_df['GROUP'] = final_model.labels_
vehicle_df.groupby('GROUP').count()

# Visual check of the clustering: pairplot coloured by cluster label.
sns.pairplot(vehicle_df, diag_kind='kde', hue='GROUP')
For hierarchical clustering, we will create a dataset by sampling from multivariate normal distributions so that we can visually observe how the clusters are formed at the end.
np.random.seed(101)  # for repeatability of this dataset

# Three 2-D Gaussian blobs with the same covariance but different means.
a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]], size=[100,])
b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]], size=[50,])
c = np.random.multivariate_normal([10, 20], [[3, 1], [1, 4]], size=[100,])
# BUG FIX: a garbled duplicate of the three sampling lines above (all mashed
# onto one physical line) was removed - it was a syntax error and, had it been
# "fixed" and executed, it would have advanced the RNG and silently changed
# the dataset.

# Stack the blobs into one 250 x 2 array and wrap them in a DataFrame.
X = np.concatenate((a, b, c),)
HC_df = pd.DataFrame()
HC_df['col1'] = X[:, 0]
HC_df['col2'] = X[:, 1]
print(HC_df.shape)
HC_df
# Scatter of the raw blobs before any clustering.
plt.scatter(X[:, 0], X[:, 1], cmap='prism')
plt.show()

HC_df = HC_df.apply(zscore)  # scaling the data
# Pairplot to showcase the distributions
sns.pairplot(HC_df, diag_kind='kde')
Use 'ward' as the linkage method and Euclidean as the distance metric.
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist

# Ward linkage with Euclidean distance on the scaled data.
Z = linkage(HC_df, method='ward', metric='euclidean', optimal_ordering=True)

# Full dendrogram of the agglomerative merge tree.
plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z, leaf_rotation=90.0, leaf_font_size=10, truncate_mode='level')
plt.tight_layout()
Hint: Use the truncate_mode='lastp' attribute in the dendrogram function to arrive at a truncated dendrogram.
# Truncated dendrogram: show only the last merged clusters ('lastp').
plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z, leaf_rotation=90.0, leaf_font_size=10, truncate_mode='lastp')
plt.tight_layout()
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.spatial.distance import pdist  # pairwise distances between data points

# Cophenetic correlation: how faithfully the dendrogram preserves the
# original pairwise distances in feature space (closer to 1 is better).
c, coph_dists = cophenet(Z, pdist(HC_df))
c, coph_dists
# Annotated dendrogram helper for picking the cut-off distance.
# Credits: https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
def fancy_dendrogram(*args, **kwargs):
    """Draw a dendrogram, annotate merge distances above a threshold, and
    optionally draw a horizontal cut-off line at ``max_d``.

    Extra keyword arguments (popped before delegating to dendrogram):
    max_d : cut-off distance; also used as the colour threshold if none given
    annotate_above : only merges higher than this are annotated
    """
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    if not kwargs.get('no_plot', False):
        plt.title('Hierarchical Clustering Dendrogram (truncated)')
        plt.xlabel('sample index or (cluster size)')
        plt.ylabel('distance')
        # Mark each sufficiently high merge with a dot and its height.
        for xs, ys, col in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
            x = 0.5 * sum(xs[1:3])
            y = ys[1]
            if y > annotate_above:
                plt.plot(x, y, 'o', c=col)
                plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                             textcoords='offset points',
                             va='top', ha='center')
        if max_d:
            plt.axhline(y=max_d, c='k')
    return ddata
# Cut-off distance of 15 on the truncated dendrogram yields the optimal 3 clusters.
max_d = 15
fancy_dendrogram(
    Z,
    truncate_mode='lastp',
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,
    annotate_above=10,
    max_d=max_d,  # plot a horizontal cut-off line
)
plt.show()
from scipy.cluster.hierarchy import fcluster

# Flat cluster assignment by cutting the merge tree at distance max_d.
clusters = fcluster(Z, max_d, criterion='distance')
clusters  # Based on fcluster we infer the maximum optimal cluster is 3
# Agglomerative clustering with the cluster count suggested by the dendrogram.
from sklearn.cluster import AgglomerativeClustering

model = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
model.fit(HC_df)

# Attach the labels and inspect the resulting cluster sizes.
HC_df['labels'] = model.labels_
HC_df.groupby(["labels"]).count()

# NOTE(review): the scatter below colours points by the fcluster result
# ('clusters'), not by model.labels_ - confirm that is intended.
plt.figure(figsize=(10, 8))
plt.scatter(HC_df['col1'], HC_df['col2'], c=clusters, cmap='prism')
plt.show()

# Thus we are able to see 3 different clusters for the distribution we started
sns.pairplot(HC_df, diag_kind='kde')  # Pairplot to visualize the distribution after clustering of data